library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the training data; character columns are read in as factors.
train <- read.csv("train.csv", stringsAsFactors = TRUE)

# Feature engineering, all in one mutate():
#   Alley     - NA is recoded to its own "None" level, then re-factored
#   TotalSF   - first floor + second floor + basement square footage
#   RichNbrhd - 1 for StoneBr / NridgHt / NoRidge, 0 for every other
#               neighborhood
train <- train %>%
  mutate(
    Alley = as.factor(replace_na(as.character(Alley), "None")),
    TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
    RichNbrhd = if_else(
      Neighborhood %in% c("StoneBr", "NridgHt", "NoRidge"), 1, 0
    )
  )
# Sale price by neighborhood; las = 2 draws the axis labels perpendicular
# to the axis so the neighborhood names stay readable.
plot(SalePrice ~ Neighborhood, data = train, las = 2)

# Model sale price with neighborhood as the only (categorical) predictor.
lm1 <- lm(formula = SalePrice ~ Neighborhood, data = train)
summary(lm1)
##
## Call:
## lm(formula = SalePrice ~ Neighborhood, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -162271 -27552 -5324 19685 419705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 194871 13097 14.879 < 2e-16 ***
## NeighborhoodBlueste -57371 40367 -1.421 0.155463
## NeighborhoodBrDale -90377 18809 -4.805 1.71e-06 ***
## NeighborhoodBrkSide -70037 14893 -4.703 2.81e-06 ***
## NeighborhoodClearCr 17695 16603 1.066 0.286721
## NeighborhoodCollgCr 3095 13819 0.224 0.822820
## NeighborhoodCrawfor 15754 15123 1.042 0.297712
## NeighborhoodEdwards -66651 14166 -4.705 2.78e-06 ***
## NeighborhoodGilbert -2016 14437 -0.140 0.888944
## NeighborhoodIDOTRR -94747 15822 -5.988 2.67e-09 ***
## NeighborhoodMeadowV -96294 18522 -5.199 2.29e-07 ***
## NeighborhoodMitchel -38601 15200 -2.540 0.011204 *
## NeighborhoodNAmes -49024 13582 -3.609 0.000318 ***
## NeighborhoodNoRidge 140424 15577 9.015 < 2e-16 ***
## NeighborhoodNPkVill -52176 22260 -2.344 0.019217 *
## NeighborhoodNridgHt 121400 14470 8.390 < 2e-16 ***
## NeighborhoodNWAmes -5821 14542 -0.400 0.689011
## NeighborhoodOldTown -66646 14047 -4.744 2.30e-06 ***
## NeighborhoodSawyer -58078 14523 -3.999 6.69e-05 ***
## NeighborhoodSawyerW -8315 14864 -0.559 0.575974
## NeighborhoodSomerst 30509 14333 2.129 0.033456 *
## NeighborhoodStoneBr 115628 16975 6.812 1.42e-11 ***
## NeighborhoodSWISU -52280 16975 -3.080 0.002111 **
## NeighborhoodTimber 47377 15756 3.007 0.002686 **
## NeighborhoodVeenker 43902 20895 2.101 0.035810 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54000 on 1435 degrees of freedom
## Multiple R-squared: 0.5456, Adjusted R-squared: 0.538
## F-statistic: 71.78 on 24 and 1435 DF, p-value: < 2.2e-16
# Simple linear regression of sale price on first-floor square footage.
lm.1stflr <- lm(formula = SalePrice ~ X1stFlrSF, data = train)
summary(lm.1stflr)
##
## Call:
## lm(formula = SalePrice ~ X1stFlrSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -460330 -36494 -13164 36291 414547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36173.447 5245.728 6.896 7.95e-12 ***
## X1stFlrSF 124.501 4.282 29.078 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63220 on 1458 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3666
## F-statistic: 845.5 on 1 and 1458 DF, p-value: < 2.2e-16
# Scatterplot of sale price against first-floor square footage.
plot(SalePrice ~ X1stFlrSF, data = train)

# Sale price by alley type (Alley includes the "None" level created earlier).
lm.alley <- lm(formula = SalePrice ~ Alley, data = train)
summary(lm.alley)
##
## Call:
## lm(formula = SalePrice ~ Alley, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148552 -50952 -15636 31581 571548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122219 11127 10.984 < 2e-16 ***
## AlleyNone 61233 11329 5.405 7.56e-08 ***
## AlleyPave 45782 16577 2.762 0.00582 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78680 on 1457 degrees of freedom
## Multiple R-squared: 0.02041, Adjusted R-squared: 0.01906
## F-statistic: 15.18 on 2 and 1457 DF, p-value: 2.996e-07
# Sale price split by alley type.
plot(SalePrice ~ Alley, data = train)

# Number of homes in each alley category.
table(train[["Alley"]])
##
## Grvl None Pave
## 50 1369 41
# Open the data viewer only in interactive sessions; View() needs a GUI and
# serves no purpose when the script is run in batch mode.
if (interactive()) {
  View(train[, c("SalePrice", "Alley")])
}

# Sale price by fence type. Fence still contains NA values here, and lm()
# drops those rows (the summary reports how many), so the fit uses only rows
# with a recorded Fence value.
lm.fence <- lm(SalePrice ~ Fence, data = train)
summary(lm.fence)
##
## Call:
## lm(formula = SalePrice ~ Fence, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -108751 -27927 -9751 10714 596249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 178928 7921 22.590 < 2e-16 ***
## FenceGdWo -38548 11458 -3.364 0.000876 ***
## FenceMnPrv -30176 9291 -3.248 0.001305 **
## FenceMnWw -44641 19981 -2.234 0.026270 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60840 on 277 degrees of freedom
## (1179 observations deleted due to missingness)
## Multiple R-squared: 0.05086, Adjusted R-squared: 0.04058
## F-statistic: 4.948 on 3 and 277 DF, p-value: 0.002313
# Count missing values per column. vapply() iterates over the columns
# directly, which avoids apply() coercing the whole data frame to a
# character matrix first, and guarantees one integer per column.
vapply(train, function(column) sum(is.na(column)), integer(1))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 0 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice TotalSF RichNbrhd
## 0 0 0
# First-floor area on its own (same fit as earlier in the script, repeated
# here to sit next to the second-floor and basement models).
lm.1stflr <- lm(formula = SalePrice ~ X1stFlrSF, data = train)
summary(lm.1stflr)
##
## Call:
## lm(formula = SalePrice ~ X1stFlrSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -460330 -36494 -13164 36291 414547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36173.447 5245.728 6.896 7.95e-12 ***
## X1stFlrSF 124.501 4.282 29.078 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63220 on 1458 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3666
## F-statistic: 845.5 on 1 and 1458 DF, p-value: < 2.2e-16
# Second-floor area on its own.
lm.2ndflr <- lm(formula = SalePrice ~ X2ndFlrSF, data = train)
summary(lm.2ndflr)
##
## Call:
## lm(formula = SalePrice ~ X2ndFlrSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -159576 -45756 -17756 27144 485454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.608e+05 2.518e+03 63.84 <2e-16 ***
## X2ndFlrSF 5.812e+01 4.517e+00 12.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75310 on 1458 degrees of freedom
## Multiple R-squared: 0.102, Adjusted R-squared: 0.1014
## F-statistic: 165.6 on 1 and 1458 DF, p-value: < 2.2e-16
# Basement area on its own.
lm.basement <- lm(formula = SalePrice ~ TotalBsmtSF, data = train)
summary(lm.basement)
##
## Call:
## lm(formula = SalePrice ~ TotalBsmtSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -582310 -39612 -14095 33315 420018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63430.629 4286.892 14.80 <2e-16 ***
## TotalBsmtSF 111.110 3.745 29.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62750 on 1458 degrees of freedom
## Multiple R-squared: 0.3765, Adjusted R-squared: 0.3761
## F-statistic: 880.3 on 1 and 1458 DF, p-value: < 2.2e-16
# Combine the three area measures in one multiple regression model, each
# with its own slope.
lm.sqft.all <- lm(
  formula = SalePrice ~ X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
  data = train
)
summary(lm.sqft.all)
##
## Call:
## lm(formula = SalePrice ~ X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -697622 -21631 -366 20427 276895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14033.593 4492.048 -3.124 0.00182 **
## X1stFlrSF 81.786 5.861 13.955 < 2e-16 ***
## X2ndFlrSF 84.493 3.021 27.973 < 2e-16 ***
## TotalBsmtSF 66.718 5.136 12.990 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49310 on 1456 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6147
## F-statistic: 776.8 on 3 and 1456 DF, p-value: < 2.2e-16
# Alternative: collapse the three areas into a single "TotalSF" variable.
# The resulting simple linear regression explains almost as much variance
# as the three-predictor model, and is far easier to graph and interpret.
train$TotalSF <- train$X1stFlrSF + train$X2ndFlrSF + train$TotalBsmtSF

lm.sqft <- lm(formula = SalePrice ~ TotalSF, data = train)
summary(lm.sqft)
##
## Call:
## lm(formula = SalePrice ~ TotalSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -715562 -22190 -669 20711 269879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13219.802 4251.189 -3.11 0.00191 **
## TotalSF 75.628 1.577 47.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49510 on 1458 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6117
## F-statistic: 2299 on 1 and 1458 DF, p-value: < 2.2e-16
# Scatterplot of sale price against total square footage.
plot(SalePrice ~ TotalSF, data = train)

# Let both the intercept and the TotalSF slope differ for the "rich"
# neighborhoods (RichNbrhd = 1) via a main effect plus an interaction.
lm.sqft.rich <- lm(
  formula = SalePrice ~ TotalSF + RichNbrhd + TotalSF:RichNbrhd,
  data = train
)
summary(lm.sqft.rich)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + RichNbrhd + TotalSF:RichNbrhd,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -532130 -21090 -866 20892 213493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.704e+04 4.078e+03 6.630 4.73e-11 ***
## TotalSF 5.659e+01 1.594e+00 35.504 < 2e-16 ***
## RichNbrhd -1.207e+05 1.695e+04 -7.122 1.66e-12 ***
## TotalSF:RichNbrhd 5.735e+01 4.696e+00 12.212 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42500 on 1456 degrees of freedom
## Multiple R-squared: 0.7143, Adjusted R-squared: 0.7137
## F-statistic: 1214 on 3 and 1456 DF, p-value: < 2.2e-16
# Same interaction model with log(SalePrice) as the response, so that the
# exponentiated coefficients act as multipliers on sale price.
lm.sqft.rich.log <- lm(
  log(SalePrice) ~ TotalSF + RichNbrhd + TotalSF:RichNbrhd,
  data = train
)
# Summarise the log-scale model that was just fitted. The previous code
# summarised lm.sqft.rich again, so any recorded output that follows shows
# the untransformed model and must be regenerated.
summary(lm.sqft.rich.log)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + RichNbrhd + TotalSF:RichNbrhd,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -532130 -21090 -866 20892 213493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.704e+04 4.078e+03 6.630 4.73e-11 ***
## TotalSF 5.659e+01 1.594e+00 35.504 < 2e-16 ***
## RichNbrhd -1.207e+05 1.695e+04 -7.122 1.66e-12 ***
## TotalSF:RichNbrhd 5.735e+01 4.696e+00 12.212 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42500 on 1456 degrees of freedom
## Multiple R-squared: 0.7143, Adjusted R-squared: 0.7137
## F-statistic: 1214 on 3 and 1456 DF, p-value: < 2.2e-16
# Exponentiating the log-model coefficients gives the multipliers for the
# increase in average sale price.
exp(coef(lm.sqft.rich.log))
## (Intercept) TotalSF RichNbrhd TotalSF:RichNbrhd
## 6.837410e+04 1.000337e+00 1.374876e+00 9.999872e-01
# So 1.000337 means each extra square foot makes the predicted value
# 1.000337 times as large. Equivalently, an addition of 1,000 square feet
# makes the home about 40% greater in value:
exp(1000 * coef(lm.sqft.rich.log)["TotalSF"])
## TotalSF
## 1.40051
# Two quantitative predictors plus their interaction: the effect of house
# size on price is allowed to change with lot size.
house3d <- lm(
  formula = SalePrice ~ TotalSF + LotArea + TotalSF:LotArea,
  data = train
)
summary(house3d)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + LotArea + TotalSF:LotArea,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -262819 -22740 638 22608 280521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.981e+04 5.702e+03 -12.24 <2e-16 ***
## TotalSF 9.266e+01 2.018e+00 45.91 <2e-16 ***
## LotArea 4.799e+00 3.551e-01 13.52 <2e-16 ***
## TotalSF:LotArea -1.274e-03 9.184e-05 -13.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46520 on 1456 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.657
## F-statistic: 932.8 on 3 and 1456 DF, p-value: < 2.2e-16
## For houses with a lot area of 1300 sf, each additional sf of the house (TotalSF) adds b[2]+b[4]*1300
## or $91 to the predicted value of the home.
## For houses with a lot area of 215245 sf, each additional sf of the house (TotalSF) drops the predicted value by $181.51, b[2]+b[4]*215245
## To embed the 3d-scatterplot inside of your html document is harder.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Axis grids: 100 evenly spaced values across the observed range of each
# predictor.
axis_x <- seq(min(train$TotalSF), max(train$TotalSF), length.out = 100)
axis_y <- seq(min(train$LotArea), max(train$LotArea), length.out = 100)

# Predict SalePrice at every (TotalSF, LotArea) grid point, then reshape the
# long table into a matrix with LotArea down the rows (y) and TotalSF across
# the columns (x), which is the layout the surface trace expects.
air_surface <- expand.grid(
  TotalSF = axis_x,
  LotArea = axis_y,
  KEEP.OUT.ATTRS = FALSE
)
air_surface$Z <- predict(house3d, newdata = air_surface)
air_surface <- acast(air_surface, LotArea ~ TotalSF, value.var = "Z")

# 3-D scatter of the observed homes with the fitted surface overlaid.
# inherit = FALSE stops the surface trace from picking up mode = "markers"
# (and the x/y/z mappings) from plot_ly(); inheriting `mode` is what
# triggered the "'surface' objects don't have these attributes" warning.
plot_ly(
  train,
  x = ~TotalSF,
  y = ~LotArea,
  z = ~SalePrice,
  type = "scatter3d",
  mode = "markers"
) %>%
  add_trace(
    x = axis_x,
    y = axis_y,
    z = air_surface,
    type = "surface",
    inherit = FALSE
  )
## Warning: 'surface' objects don't have these attributes: 'mode'
## Valid attributes include:
## '_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Same scatter, but with a y-range wide enough to show the fitted lines
# drawn on top of it for very large lot areas.
plot(SalePrice ~ TotalSF, data = train, ylim = c(-1e6, 1e6))

# Coefficients of the interaction model, in the order
# (Intercept), TotalSF, LotArea, TotalSF:LotArea.
b <- coef(house3d)
print(b)
## (Intercept) TotalSF LotArea TotalSF:LotArea
## -6.981395e+04 9.266018e+01 4.798759e+00 -1.273770e-03
# Overlay, on the current plot, the fitted SalePrice-vs-TotalSF line of the
# interaction model for one fixed lot area.
#
# Arguments:
#   LotArea  lot area at which the fitted surface is sliced.
#   col      line colour passed to curve(). The old default `col = col`
#            referred to itself and failed with "promise already under
#            evaluation" whenever col was omitted; it now defaults to
#            "black".
#   coefs    coefficient vector ordered (Intercept), TotalSF, LotArea,
#            TotalSF:LotArea. Defaults to the global `b`, which is what the
#            function used before.
#
# Returns (invisibly) the list of x/y values produced by curve().
# An existing plot is required because curve() is called with add = TRUE.
drawit <- function(LotArea, col = "black", coefs = b) {
  curve(
    coefs[1] + coefs[2] * TotalSF + coefs[3] * LotArea +
      coefs[4] * TotalSF * LotArea,
    add = TRUE,
    col = col,
    xname = "TotalSF"
  )
}
# One red fitted line per lot-area value on the axis_y grid.
for (la in axis_y) drawit(la, col = "red")

# Two specific lot areas: 1300 in red and 215245 in blue.
drawit(1300, col = "red")
drawit(215245, col = "blue")

# Additive model with six predictors (Alley is categorical), fitted on the
# full data set.
househd <- lm(
  formula = SalePrice ~ TotalSF + LotArea + GarageArea + Alley + FullBath +
    ScreenPorch,
  data = train
)
summary(househd)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + LotArea + GarageArea + Alley +
## FullBath + ScreenPorch, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -612807 -18993 -1339 17487 288849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.564e+04 7.237e+03 -7.688 2.73e-14 ***
## TotalSF 5.262e+01 2.045e+00 25.732 < 2e-16 ***
## LotArea 2.399e-01 1.251e-01 1.918 0.055308 .
## GarageArea 9.187e+01 6.756e+00 13.598 < 2e-16 ***
## AlleyNone 2.431e+04 6.546e+03 3.714 0.000211 ***
## AlleyPave 1.800e+04 9.573e+03 1.880 0.060323 .
## FullBath 2.004e+04 2.669e+03 7.510 1.03e-13 ***
## ScreenPorch 5.459e+01 2.142e+01 2.549 0.010920 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45170 on 1452 degrees of freedom
## Multiple R-squared: 0.6782, Adjusted R-squared: 0.6767
## F-statistic: 437.2 on 7 and 1452 DF, p-value: < 2.2e-16
# Reproducible train/validation split: num_rows randomly chosen rows are
# used for fitting, every remaining row is held out (1460 rows in total).
set.seed(121)
num_rows <- 1000
keep <- sample(seq_len(nrow(train)), size = num_rows)
mytrain <- train[keep, ]   # pass to lm(..., data = mytrain)
mytest <- train[-keep, ]   # pass to predict(..., newdata = mytest)

# Refit the additive model on the training rows only.
househd <- lm(
  formula = SalePrice ~ TotalSF + LotArea + GarageArea + Alley + FullBath +
    ScreenPorch,
  data = mytrain
)
summary(househd)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + LotArea + GarageArea + Alley +
## FullBath + ScreenPorch, data = mytrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -375747 -19694 -1284 20342 260709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.604e+04 7.950e+03 -8.307 3.20e-16 ***
## TotalSF 6.043e+01 2.438e+00 24.787 < 2e-16 ***
## LotArea 1.975e-01 1.786e-01 1.106 0.26908
## GarageArea 1.027e+02 8.024e+00 12.799 < 2e-16 ***
## AlleyNone 2.843e+04 7.045e+03 4.036 5.87e-05 ***
## AlleyPave 2.522e+04 1.071e+04 2.354 0.01874 *
## FullBath 9.066e+03 3.152e+03 2.876 0.00411 **
## ScreenPorch 5.255e+01 2.607e+01 2.016 0.04407 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42900 on 992 degrees of freedom
## Multiple R-squared: 0.7228, Adjusted R-squared: 0.7209
## F-statistic: 369.5 on 7 and 992 DF, p-value: < 2.2e-16
# Refit the interaction model on the training rows so that both models are
# judged on data they have not seen.
house3d <- lm(SalePrice ~ TotalSF + LotArea + TotalSF:LotArea, data = mytrain)

# Predictions for the held-out rows.
yh_hd <- predict(househd, newdata = mytest)
yh_3d <- predict(house3d, newdata = mytest)

# Validation sums of squares, measured around the held-out mean.
ybar <- mean(mytest$SalePrice)
SSTO <- sum((mytest$SalePrice - ybar)^2)
SSE_hd <- sum((mytest$SalePrice - yh_hd)^2)
SSE_3d <- sum((mytest$SalePrice - yh_3d)^2)

# Validation R-squared.
rs_hd <- 1 - SSE_hd / SSTO
rs_3d <- 1 - SSE_3d / SSTO

# Validation adjusted R-squared. p must be the number of estimated
# coefficients (intercept included). length() applied to the lm object
# itself counts the components of the fitted-model list instead, which is
# what the earlier version did, so the penalty term was wrong. Any recorded
# rsa_* values produced with that version must be regenerated.
n <- nrow(mytest)
p_3d <- length(coef(house3d))
p_hd <- length(coef(househd))
rsa_hd <- 1 - (n - 1) / (n - p_hd) * SSE_hd / SSTO
rsa_3d <- 1 - (n - 1) / (n - p_3d) * SSE_3d / SSTO
rsa_hd
## [1] 0.5286034
# Training-set summary of househd, shown next to the validation adjusted
# R-squared printed just above for comparison.
summary(househd)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + LotArea + GarageArea + Alley +
## FullBath + ScreenPorch, data = mytrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -375747 -19694 -1284 20342 260709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.604e+04 7.950e+03 -8.307 3.20e-16 ***
## TotalSF 6.043e+01 2.438e+00 24.787 < 2e-16 ***
## LotArea 1.975e-01 1.786e-01 1.106 0.26908
## GarageArea 1.027e+02 8.024e+00 12.799 < 2e-16 ***
## AlleyNone 2.843e+04 7.045e+03 4.036 5.87e-05 ***
## AlleyPave 2.522e+04 1.071e+04 2.354 0.01874 *
## FullBath 9.066e+03 3.152e+03 2.876 0.00411 **
## ScreenPorch 5.255e+01 2.607e+01 2.016 0.04407 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42900 on 992 degrees of freedom
## Multiple R-squared: 0.7228, Adjusted R-squared: 0.7209
## F-statistic: 369.5 on 7 and 992 DF, p-value: < 2.2e-16
# Validation adjusted R-squared for the interaction model.
rsa_3d
## [1] 0.5761344
# Training-set summary of house3d (fitted on mytrain), for comparison.
summary(house3d)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + LotArea + TotalSF:LotArea,
## data = mytrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -347259 -22001 506 21638 257755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.900e+04 8.896e+03 -5.508 4.61e-08 ***
## TotalSF 8.750e+01 3.120e+00 28.040 < 2e-16 ***
## LotArea 2.339e+00 7.516e-01 3.113 0.00191 **
## TotalSF:LotArea -6.272e-04 2.235e-04 -2.806 0.00511 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46890 on 996 degrees of freedom
## Multiple R-squared: 0.6675, Adjusted R-squared: 0.6665
## F-statistic: 666.6 on 3 and 996 DF, p-value: < 2.2e-16